This notebook investigates the Crunchbase 2013 Snapshot dataset to extract information about which features distinguish the successful startups from the failed ones. The dataset is composed of 11 CSV files:
- Objects: table with information about the companies, persons and products.
- Offices: information about physical offices of the companies.
- Relationships: information about the relationships between people and companies.
- Investments: connects the investor companies with the invested companies.
- Funding rounds: information about the funding rounds of the companies.
- Acquisitions: table with the information about the acquisition of companies by other companies.
# imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly
import plotly.express as px
from pathlib import Path
#jupyter nbconvert src/startups_study.ipynb --no-input --to html
# notebook config
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')  # silence library warnings to keep the rendered output clean
pd.set_option('display.max_columns', None)  # show every column of wide frames
pd.set_option("display.precision", 3)  # 3 decimal places in displayed floats
pd.set_option("display.expand_frame_repr", False)  # keep wide frames on one line
plotly.offline.init_notebook_mode()  # enable offline plotly rendering in the notebook
Read the CSV files, drop unnecessary columns and convert date columns to datetime types. Clean the data where necessary.
# read the data files
data_dir = Path('../data/startup-investments/')

# main dataframe with all the startups, persons, financial orgs, etc.
objects = pd.read_csv(data_dir / 'objects.csv').drop(
    columns=[
        'normalized_name', 'permalink',
        'created_at', 'updated_at', 'first_investment_at',
        'last_investment_at', 'created_by', 'domain',
        'twitter_username', 'logo_url', 'logo_width',
        'logo_height', 'overview', 'parent_id'
    ]
)

# parse the date columns into proper datetime dtype (unparseable values -> NaT)
date_columns = [
    'founded_at', 'closed_at',
    'first_funding_at', 'last_funding_at',
    'first_milestone_at'
]
for date_col in date_columns:
    objects[date_col] = pd.to_datetime(objects[date_col], errors='coerce')
# objects.head(1)
acquisitions = pd.read_csv(data_dir / 'acquisitions.csv')
# record of companies acquiring other companies, prices and dates
acquisitions.drop(
    [
        'id', 'acquisition_id', 'created_at',
        'updated_at', 'source_url', 'source_description'
    ],
    inplace=True,
    axis=1,
)
# vectorized parse: Series.apply(pd.to_datetime) converted one element at a time
acquisitions['acquired_at'] = pd.to_datetime(acquisitions['acquired_at'], errors='coerce')
# acquisitions.head(1)
funding_rounds = pd.read_csv(data_dir / 'funding_rounds.csv')
# info about funding of companies; keep only USD-normalized amounts
funding_rounds.drop(
    [
        'created_at', 'updated_at', 'created_by',
        'raised_amount', 'raised_currency_code',
        'pre_money_valuation', 'post_money_valuation_usd',
        'pre_money_currency_code', 'post_money_valuation',
        'pre_money_valuation_usd', 'post_money_currency_code',
        'source_url'
    ],
    inplace=True,
    axis=1,
)
# vectorized parse: Series.apply(pd.to_datetime) converted one element at a time
funding_rounds['funded_at'] = pd.to_datetime(funding_rounds['funded_at'], errors='coerce')
# funding_rounds.head(2)
funds = pd.read_csv(data_dir / 'funds.csv')
# funds information
funds.drop(
    [
        'id', 'created_at', 'updated_at',
        'source_url', 'source_description'
    ],
    inplace=True,
    axis=1,
)
# vectorized parse: Series.apply(pd.to_datetime) converted one element at a time
funds['funded_at'] = pd.to_datetime(funds['funded_at'], errors='coerce')
# funds.head(1)
# relate investor objects with the companies they invested in
investments = (
    pd.read_csv(data_dir / 'investments.csv')
    .drop(columns=['id', 'created_at', 'updated_at'])
)
# investments.head(1)
ipos = pd.read_csv(data_dir / 'ipos.csv')
# info about ipos
ipos.drop(
    [
        'id', 'created_at', 'updated_at',
        'stock_symbol', 'source_url', 'source_description'
    ],
    inplace=True,
    axis=1,
)
# vectorized parse: Series.apply(pd.to_datetime) converted one element at a time
ipos['public_at'] = pd.to_datetime(ipos['public_at'], errors='coerce')
# ipos.head(1)
milestones = pd.read_csv(data_dir / 'milestones.csv')
milestones.drop(
    [
        'id', 'created_at', 'updated_at',
        'source_url', 'milestone_code'
    ],
    inplace=True,
    axis=1,
)
# vectorized parse: Series.apply(pd.to_datetime) converted one element at a time
milestones['milestone_at'] = pd.to_datetime(milestones['milestone_at'], errors='coerce')
# milestones.head(1)
# physical location of company offices
offices = pd.read_csv(data_dir / 'offices.csv').drop(
    columns=['id', 'zip_code', 'created_at', 'updated_at']
)
# offices.head(1)
# person records (the surrogate id is redundant with object_id)
people = pd.read_csv(data_dir / 'people.csv').drop(columns=['id'])
# people.head(1)
# links between people and the companies they work(ed) for
relationships = pd.read_csv(data_dir / 'relationships.csv').drop(
    columns=['id', 'sequence', 'created_at', 'updated_at']
)
# relationships.head(1)
Split the objects table into companies, persons, financial organizations and products.
# keep only company entities; 'CSS'/'FST' are not valid country codes
companies = objects.loc[
    (objects['entity_type'] == 'Company')
    & ~objects['country_code'].isin(['CSS', 'FST'])
]
# companies.head(1)
# financial organizations: drop company-only columns that make no sense here
financial_org = objects.loc[objects['entity_type'] == 'FinancialOrg'].drop(
    columns=[
        'closed_at', 'entity_id',
        'category_code', 'status', 'funding_rounds',
        'funding_total_usd', 'first_funding_at',
        'last_funding_at', 'milestones',
        'last_milestone_at', 'first_milestone_at'
    ]
)
# financial_org.head(1)
# products: drop funding/company-level columns that do not apply to products
products = objects.loc[objects['entity_type'] == 'Product'].drop(
    columns=[
        'region', 'entity_type', 'investment_rounds',
        'invested_companies', 'funding_rounds',
        'first_funding_at', 'last_funding_at',
        'funding_total_usd', 'relationships'
    ]
)
# products.head(1)
# person entities from the objects table
persons = objects.loc[objects['entity_type'] == 'Person']
# persons.head(1)
Create a table with just the companies that appear in the IPO table and attach their funding rounds.
# companies that reached IPO: inner-join the companies with the ipos table
companies_ipo = (
    companies
    .rename(columns={'id': 'object_id'})
    .merge(ipos, on='object_id')
    .drop(
        columns=[
            'homepage_url', 'entity_type',
            'short_description', 'description', 'tag_list',
            'valuation_amount', 'valuation_currency_code',
            'raised_amount', 'public_at',
            'raised_currency_code', 'ipo_id',
        ]
    )
)
# print(companies_ipo.shape)
# print(companies_ipo['status'].value_counts())
# companies_ipo.head(1)
# funding rounds of the IPO companies, one row per (company, round)
# (removed a dead rename of 'id' -> 'object_id': companies_ipo already
# uses 'object_id' and has no 'id' column, so the rename was a no-op)
rounds = pd.merge(
    companies_ipo,
    funding_rounds,
    on='object_id'
).filter(
    [
        'object_id', 'funded_at', 'funding_round_id',
        'funding_round_type', 'raised_amount_usd',
    ]
).dropna(subset=['funding_round_type'])
# rounds.head()

# pivot: one column per funding_round_type with the total USD raised;
# margins=True adds an 'All' total column (the 'All' margin *row* is
# discarded by the later inner merge on object_id)
rounds = pd.pivot_table(
    rounds,
    index=['object_id'],
    columns=['funding_round_type'],
    values='raised_amount_usd',
    aggfunc='sum',
    margins=True,
).fillna(0)
# rounds.head()
# attach the per-round-type totals to the IPO companies
companies_ipo = companies_ipo.merge(rounds, on='object_id')
# companies_ipo.head(1)

# one-hot encode the company status
status_dummies = companies_ipo['status'].str.get_dummies().add_prefix('status_')
final_df = pd.concat([companies_ipo, status_dummies], axis=1)
# final_df.head(2)
Add number of employees to the company table and expand categorical columns
# number of employees = count of relationship rows pointing at each company
employees = (
    relationships
    .filter(['person_object_id', 'relationship_object_id'])
    .groupby('relationship_object_id')
    .count()
    .reset_index()
    .rename(columns={'relationship_object_id': 'id', 'person_object_id': 'employees'})
)
# employees.head(2)

# attach the employee count (inner join: companies without relationships are dropped)
companies = companies.merge(employees, on='id')
# companies.head(1)
def convert_to_usd(data) -> float:
    """Convert a row's acquisition price to USD.

    Parameters
    ----------
    data : mapping with 'price_amount' and 'price_currency_code' keys,
        typically a DataFrame row supplied by ``DataFrame.apply(axis=1)``.

    Returns
    -------
    float
        Price converted to USD, rounded to whole units, or NaN when the
        currency code is not in the conversion table (the previous version
        raised KeyError on unknown currencies).
    """
    # NOTE(review): fixed snapshot of exchange rates — TODO confirm reference date
    conversion_rates = {
        'USD': 1,
        'EUR': 1.13,
        'CAD': 0.78,
        'GBP': 1.35,
        'JPY': 0.0087,
        'AUD': 0.72,
        'SEK': 0.11,
    }
    rate = conversion_rates.get(data['price_currency_code'])
    if rate is None:
        # unknown currency: propagate as missing instead of crashing
        return float('nan')
    return round(data['price_amount'] * rate, 0)
# keep only acquisitions with a known currency code, then normalize to USD
acquisitions = acquisitions.loc[acquisitions['price_currency_code'].notna()]
acquisitions['price_amount_in_usd'] = acquisitions.apply(convert_to_usd, axis=1)
# acquisitions.head(2)
# acquisitions.sort_values('price_amount', ascending=False).head()
# acquisitions[acquisitions['acquiring_object_id']=='c:9952']
# total amount (already normalized to USD) each company spent acquiring others
tmp_df = acquisitions.filter([
    'acquiring_object_id', 'acquired_object_id', 'price_amount',
    'price_currency_code', 'acquired_at', 'price_amount_in_usd'
])
tmp_df.rename(columns={'acquiring_object_id': 'id'}, inplace=True)
# BUG FIX: grouping by currency as well produced several rows per acquirer,
# and the drop_duplicates below then kept an arbitrary partial sum. Amounts
# are already converted to USD, so group by the company id only.
tmp_df = tmp_df.groupby('id')['price_amount_in_usd'].sum().reset_index(name='spent_in_acquirements')
# print(tmp_df[tmp_df['id']=='c:9952'])
tmp_df.sort_values('spent_in_acquirements').tail(10)

companies = pd.merge(companies, tmp_df, on='id', how='left')
companies['spent_in_acquirements'].fillna(0, inplace=True)  # no acquisitions -> 0 spent
companies.drop_duplicates('id', inplace=True)
# store low-cardinality string columns as pandas categoricals
categorical_cols = ['status', 'category_code', 'country_code', 'state_code', 'city', 'region']
companies[categorical_cols] = companies[categorical_cols].astype('category')
# companies.head(2)
# get percentage of NaNs
# round(100*(companies.isnull().sum()/len(companies.index)), 2)

# drop rows missing the key analysis fields; .copy() so later column
# assignments operate on an independent frame instead of a view of
# `companies` (the SettingWithCopyWarning was masked by filterwarnings)
companies_clean = companies[~(
    companies['category_code'].isnull() |
    companies['country_code'].isnull() |
    companies['founded_at'].isnull()
)].copy()
# round(100*(companies_clean.isnull().sum()/len(companies_clean.index)), 2)
# group categories together by field
def group_categories(data: pd.Series) -> str:
    """Map a row's fine-grained category_code to one of five broad groups.

    Intended for ``DataFrame.apply(axis=1)``: ``data`` is a row providing a
    'category_code' key. Returns 'tech', 'service', 'business',
    'manufacturing' or 'other' (also for unknown/unlisted codes).
    """
    # NOTE: 'ecommerce' was listed under both tech and business; the tech
    # branch is checked first, so the business entry was dead — removed.
    tech_cat = [
        'software', 'web', 'biotech', 'games_video', 'network_hosting',
        'cleantech', 'nanotech', 'ecommerce', 'search', 'social', 'news',
        'messaging', 'photo_video', 'music'
    ]
    services_cat = [
        'hospitality', 'pets', 'medical', 'health', 'education', 'security',
        'travel', 'public_relations',
    ]
    business_cat = [
        'advertising', 'enterprise', 'consulting', 'analytics',
        'finance', 'legal'
    ]
    manufacturing_cat = [
        'mobile', 'manufacturing', 'design', 'hardware', 'automotive', 'sports', 'transportation',
        'fashion', 'real_estate', 'semiconductor', 'local'
    ]
    code = data['category_code']
    if code in tech_cat:
        return 'tech'
    elif code in services_cat:
        return 'service'
    elif code in business_cat:
        return 'business'
    elif code in manufacturing_cat:
        return 'manufacturing'
    else:
        # everything else (including the explicit 'other' code) falls through here
        return 'other'
# broad category group plus year/month/day features from the founding date
companies_clean['category_group'] = companies_clean.apply(group_categories, axis=1)
founded = companies_clean['founded_at']
companies_clean['founding_year'] = founded.dt.year
companies_clean['founding_month'] = founded.dt.month
companies_clean['founding_day'] = founded.dt.day
Plot the data to see how it is distributed, etc
# global plotting defaults
plt.rcParams['figure.figsize'] = (10, 10)
sns.set_theme(palette='Greys')
Distribution of the status of the companies in the dataset. We can see that the vast majority of the companies are still operating.
# share of companies in each status
status_counts = companies.groupby('status')['id'].count().reset_index()
fig = px.pie(status_counts, values='id', names='status')
fig.show()

# unused alternative: pie chart of company categories
# aux = companies_clean['category_code'].value_counts()
# aux = pd.DataFrame([aux.index, aux]).transpose()
# aux.columns = ['category', 'count']
# aux['category'] = aux['category'].str.title().str.replace('_', ' ')
# fig = px.pie(
#     aux,
#     values='count',
#     names='category',
#     title='Category of the companies',
# )
# fig.update_traces(textposition='inside')
# fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
# fig.show()
Let's explore the most popular categories of the companies in the dataset
# bar plot of company categories, most common first
category_order = companies_clean['category_code'].value_counts().index
p = sns.catplot(
    data=companies_clean.sort_values('category_code'),
    x='category_code',
    kind='count',
    order=category_order,
    height=4,
    aspect=4,
    color='tab:blue',
)
for label in p.ax.get_xticklabels():
    label.set_rotation(90)
And the countries with the most startups:
# top 10 countries by number of companies (value_counts computed once)
top_countries = companies_clean['country_code'].value_counts().head(10)
plt.figure(figsize=(10, 4))
sns.barplot(
    x=top_countries.index,
    y=top_countries.values,
    order=top_countries.index,
    color='tab:blue',
).set(title='Top 10 countries')
[Text(0.5, 1.0, 'Top 10 countries')]
Let's see the founding years of the different companies by category:
# number of companies founded per year and category group (long-form frame:
# the previous version passed a MultiIndex Series as `data` and a raw
# .values array as `y`, relying on positional alignment; a tidy frame is
# seaborn's documented long-form input)
categories_by_year = (
    companies_clean
    .loc[companies_clean['founding_year'] < 2015]
    .groupby(['founding_year', 'category_group'])
    .size()
    .reset_index(name='count')
)
# print(categories_by_year.tail())
plt.figure(figsize=(10, 4))
sns.lineplot(
    data=categories_by_year,
    x='founding_year',
    y='count',
    hue='category_group',
    palette='pastel',
).set(title='Category of companies per year')
[Text(0.5, 1.0, 'Category of companies per year')]
Now plot the physical offices of the companies. We can see that the USA and Europe are by far the most densely covered.
# Plot the physical offices in a map
fig = px.density_mapbox(
    offices,
    lat='latitude',
    lon='longitude',
    zoom=2,
    height=300,
    hover_name='city',
    # color_discrete_sequence=['blue'],
    # a list instead of a set: set iteration order is nondeterministic, so
    # the hover fields could appear in a different order on every run
    hover_data=['description', 'address1', 'object_id'],
    radius=5,
)
fig.update_layout(
    mapbox_style='open-street-map',
    margin={'r': 0, 't': 0, 'l': 0, 'b': 0},
    mapbox_center_lat=45,
    mapbox_center_lon=-25,
)
fig.show()
Plotting the number of employees for each company we see that the majority of companies have 50 or fewer employees. The companies with more employees are very successful ones.
# employee count per company
employees = companies_clean[['name', 'employees']]
fig = px.scatter(
    employees,
    x='name',
    y='employees',
    title='Number of employees in each company',
)
fig.show()
# acquisition spending of the IPO companies that actually acquired someone
# (removed a dead rename of 'normalized_name' -> 'name': that column was
# dropped from `objects` at load time, so the rename was a no-op)
acquirements = companies.loc[companies.status == 'ipo', ('name', 'spent_in_acquirements')]
# acquirements = acquirements[acquirements.name != 'General Motors']
acquirements = acquirements[acquirements['spent_in_acquirements'] > 0]
fig = px.scatter(
    acquirements,
    x='name',
    y='spent_in_acquirements',
    title='',
    labels={'spent_in_acquirements': 'Spent in acquirements (USD)', 'name': 'companies'}
)
fig.show()
The correlation matrix shows the relationship of pairs of variables in the dataframe.
# Compute the correlation matrix: dummy-encode the categorical columns first
companies_clean_expanded = companies_clean.copy()
country_dummies = companies_clean_expanded['country_code'].str.get_dummies().add_prefix('country_')
category_dummies = companies_clean_expanded['category_code'].str.get_dummies().add_prefix('category_')
status_dummies = companies_clean_expanded['status'].str.get_dummies().add_prefix('status_')
category_group_dummies = companies_clean_expanded['category_group'].str.get_dummies().add_prefix('cat_group_')
# country and category dummies are computed but intentionally left out of
# the expanded frame (too many columns for a readable heatmap)
companies_clean_expanded = pd.concat(
    [companies_clean_expanded, status_dummies, category_group_dummies],
    axis=1,
)
companies_clean_expanded.drop(['country_code', 'category_code', 'status'], axis=1).head()
# correlation matrix over the numeric / dummy columns only
non_numeric_cols = [
    'id', 'name', 'closed_at', 'tag_list', 'state_code',
    'city', 'region', 'first_funding_at', 'last_funding_at',
    'country_code', 'category_code', 'status', 'entity_id',
    'founding_month', 'founding_day', 'founding_year'
]
corr = companies_clean_expanded.drop(labels=non_numeric_cols, axis=1).corr()
# mask the upper triangle so each pair of variables appears only once
mask = np.triu(np.ones_like(corr, dtype=bool))
# custom diverging colormap centered on zero
cmap = sns.diverging_palette(230, 20, as_cmap=True)
plt.figure(figsize=(14, 14))
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmax=0.99,
    vmin=-0.99,
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={'shrink': 0.5},
    annot=True,
)
<AxesSubplot:>
Some relations like investment_rounds/invested_companies are obvious and give no useful information. However we can see that status_ipo is highly related to variables like employees, which could indicate some interesting information. Also milestones is highly related to status_ipo and status_acquired.
# per-country counts of each company status, plus derived percentage rates
t = companies.groupby(['country_code'], as_index=True)['status'].value_counts().to_frame().unstack().fillna(0).reset_index()
t.columns = ['country_code', 'acquired', 'closed', 'ipo', 'operating']
t['total'] = t.sum(axis=1, numeric_only=True)
for status_name in ['ipo', 'acquired', 'operating', 'closed']:
    t[f'{status_name}_rate'] = t[status_name] / t['total'] * 100
# t.head(1)
# stacked histogram of company status per country
fig = px.histogram(
    companies_clean,
    x='country_code',
    color='status',
    labels={'country_code': 'countries', 'count': 'number of companies'},
)
fig.show()
# same breakdown without the USA, which otherwise dwarfs every other country
non_usa = companies_clean[companies_clean['country_code'] != 'USA'].sort_values(by='country_code')
fig = px.histogram(
    non_usa,
    x='country_code',
    color='status',
    title='Companies per country excluding the USA',
)
fig.show()
Let's check the percentage of companies with the IPO status in each country.
# distribution of per-country IPO rate, each country shown as a point
fig = px.box(t, x='ipo_rate', hover_data={'country_code'}, points='all')
fig.show()
A lot of countries have 0 companies with the IPO status, and the highest IPO rates all come from developed Asian countries.
# IPO rate per country, highest first (only countries with at least one IPO)
has_ipo = t[t['ipo'] > 0].sort_values(by='ipo_rate', ascending=False)
fig = px.histogram(has_ipo, x='country_code', y='ipo_rate', hover_data={'country_code'})
fig.show()
Let's take a look at the countries with a lot of companies and 0 IPO companies.
# how many companies do the countries without a single IPO have?
fig = px.box(t[t['ipo'] == 0], x='total', hover_data={'country_code'}, points='all')
fig.show()
# ten countries with the most registered companies and zero IPOs
t.loc[t['ipo'] == 0].sort_values(by='total', ascending=False).head(10)
| country_code | acquired | closed | ipo | operating | total | ipo_rate | acquired_rate | operating_rate | closed_rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 42 | ESP | 19 | 11 | 0 | 704 | 734 | 0.0 | 2.589 | 95.913 | 1.499 |
| 22 | BRA | 10 | 10 | 0 | 400 | 420 | 0.0 | 2.381 | 95.238 | 2.381 |
| 38 | DNK | 14 | 9 | 0 | 221 | 244 | 0.0 | 5.738 | 90.574 | 3.689 |
| 131 | TUR | 2 | 1 | 0 | 225 | 228 | 0.0 | 0.877 | 98.684 | 0.439 |
| 11 | AUT | 7 | 5 | 0 | 165 | 177 | 0.0 | 3.955 | 93.220 | 2.825 |
| 106 | POL | 1 | 2 | 0 | 171 | 174 | 0.0 | 0.575 | 98.276 | 1.149 |
| 54 | HKG | 2 | 8 | 0 | 154 | 164 | 0.0 | 1.220 | 93.902 | 4.878 |
| 100 | NZL | 0 | 5 | 0 | 132 | 137 | 0.0 | 0.000 | 96.350 | 3.650 |
| 135 | UKR | 0 | 0 | 0 | 129 | 129 | 0.0 | 0.000 | 100.000 | 0.000 |
| 114 | ROM | 0 | 2 | 0 | 117 | 119 | 0.0 | 0.000 | 98.319 | 1.681 |
Those countries with a lot of registered startups but not a single one with IPO status could be countries where it is not easy to develop this kind of business, at least in the public market.
# status breakdown for the countries with zero IPOs, sorted by closed rate
zero_ipo = t[t['ipo'] == 0].sort_values(by='closed_rate', ascending=False)
fig = px.bar(
    zero_ipo,
    x='country_code',
    y=['operating', 'acquired', 'closed'],
    hover_name='country_code',
    hover_data={'operating_rate', 'acquired_rate', 'closed_rate'},
    labels={'value': 'registered companies', 'country_code': 'countries'},
    title='Companies in countries with 0 IPO (sorted by closed rate)',
)
fig.show()
# companies.head()
# sort key used to order companies by status in the plots below.
# BUG FIX: 'aqcuired' was misspelled, so every 'acquired' company mapped to
# NaN in sort_values(key=lambda x: x.map(status_sort)) and sorted unpredictably.
status_sort = {
    'operating': 0,
    'acquired': 1,
    'ipo': 2,
    'closed': 3
}
# investment rounds vs status, with a marginal histogram of the rounds
fig = px.scatter(
    data_frame=companies,
    x='investment_rounds',
    y='status',
    hover_name='name',
    marginal_y="histogram",
)
fig.show()
# total funding per company, colored by status (only companies with funding)
funded = companies[companies['funding_total_usd'] != 0]
fig = px.scatter(
    funded.sort_values(by='status', key=lambda s: s.map(status_sort)),
    x='name',
    y='funding_total_usd',
    color='status',
    title='',
    labels={'name': 'company', 'funding_total_usd': 'funding (USD)'},
)
fig.show()
# relationship count per company (only companies with more than 20), by status
well_connected = companies[companies['relationships'] > 20]
fig = px.scatter(
    well_connected.sort_values(by='status', key=lambda s: s.map(status_sort)),
    x='name',
    y='relationships',
    color='status',
    title='',
    labels={'name': 'company'}
)
fig.show()
# outcome rates per funding-round type
tmp = pd.merge(companies, funding_rounds, left_on='id', right_on='object_id', how='left')
tmp = tmp.groupby(['funding_round_type', 'status'], as_index=True)['name'].count().unstack()
tmp['total'] = tmp.sum(axis=1, numeric_only=True)
for status_name in ['acquired', 'closed', 'ipo', 'operating']:
    tmp[f'{status_name}_rate'] = tmp[status_name] / tmp['total']
print(tmp.index)
tmp.reset_index(inplace=True)
# tmp.head()
Index(['angel', 'crowdfunding', 'other', 'post-ipo', 'private-equity',
'series-a', 'series-b', 'series-c+', 'venture'],
dtype='object', name='funding_round_type')
# stacked outcome rates per funding-round type ('post-ipo' excluded: by
# definition those companies already went public)
fig = px.bar(
    tmp[tmp['funding_round_type'] != 'post-ipo'],
    x='funding_round_type',
    y=['operating_rate', 'acquired_rate', 'closed_rate', 'ipo_rate'],
)
fig.update_layout(barmode='stack', xaxis={
    'categoryorder': 'array',
    'categoryarray': ['venture', 'angel', 'series-a', 'series-b', 'series-c+', 'other', 'private-equity', 'crowdfunding'],
})
fig.show()
# pairwise view of status, category group, employees and funding rounds
fig = px.scatter_matrix(
    companies_clean,
    dimensions=['status', 'category_group', 'employees', 'funding_rounds'],
    hover_name='name',
)
fig.show()